All group members participated in both assignments and, after discussion, compiled this report.
# Word cloud for the Five.txt reviews -----------------------------------
# Read the file, one line per vector element.
fivetext <- read_lines("Five.txt")
# Store the lines in a tibble and record each line's number.
fivetextFrame <- tibble(text = fivetext) %>%
  mutate(line = row_number())
# Six-color palette reused by every word cloud below.
pal <- brewer.pal(6, "Dark2")
# Tokenize to one word per row, drop stop words, count the remaining
# words, then draw a word cloud of the 100 most frequent ones.
five_counts <- fivetextFrame %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word)
fivetidy_frame <- five_counts %>%
  with(wordcloud(word, n, max.words = 100, colors = pal, random.order = FALSE))
# Word cloud for the OneTwo.txt reviews, built the same way --------------
onetwotext <- read_lines("OneTwo.txt")
onetwotextFrame <- tibble(text = onetwotext) %>%
  mutate(line = row_number())
# Same six-color palette as before.
pal <- brewer.pal(6, "Dark2")
onetwo_counts <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word)
onetwotidy_frame <- onetwo_counts %>%
  with(wordcloud(word, n, max.words = 100, colors = pal, random.order = FALSE))
Which words are mentioned most often?
By observing the plots, “watch”, “casio”, and “time” are the words mentioned most often in both plots.
#aggregating each 10 lines into a separate "document" using OneTwo data.
onetwotidy_frame2 <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  mutate(line1 = floor(line / 10)) %>%  # document index = line number div 10
  count(line1, word, sort = TRUE)       # word counts per document
#compute TF-IDF values per (document, word) pair
TFIDF <- onetwotidy_frame2 %>% bind_tf_idf(word, line1, n)
#get the unique words (in order of first appearance)
words <- unique(TFIDF$word)
# Mean TF-IDF of each word over all documents.  vapply() replaces the
# original index-growing for-loop: identical order and values, but the
# result vector is preallocated and the loop no longer reuses `n`
# (also the name of the count column) as an index variable.
tf_idfmean <- vapply(
  words,
  function(w) mean(TFIDF$tf_idf[TFIDF$word == w]),
  numeric(1),
  USE.NAMES = FALSE
)
TF_IDFmean <- data.frame(word = words, tf_idfmean = tf_idfmean)
#plot a word cloud weighted by the mean TF-IDF values
TF_IDFmean %>% with(wordcloud(words = word,
                              freq = tf_idfmean,
                              scale = c(1, .5),
                              max.words = 100,
                              colors = pal,
                              random.order = FALSE))
# Show the mean-TF-IDF row for the word "watch".
print(subset(TF_IDFmean, word == "watch"))
## word tf_idfmean
## 7 watch 0
# Show every (document, "watch") TF-IDF row.
print(subset(TFIDF, word == "watch"))
## # A tibble: 11 × 6
## line1 word n tf idf tf_idf
## <dbl> <chr> <int> <dbl> <dbl> <dbl>
## 1 6 watch 20 0.0321 0 0
## 2 1 watch 13 0.0227 0 0
## 3 3 watch 13 0.0423 0 0
## 4 0 watch 12 0.0182 0 0
## 5 2 watch 12 0.0391 0 0
## 6 4 watch 12 0.0191 0 0
## 7 5 watch 12 0.0194 0 0
## 8 8 watch 10 0.0229 0 0
## 9 9 watch 10 0.0281 0 0
## 10 7 watch 7 0.0179 0 0
## 11 10 watch 1 0.0625 0 0
Comparing this plot with the corresponding plot from question 1, we see that the most prominent words have changed: the word “watch” is no longer emphasized in the TF-IDF word cloud. The reason is that the word frequencies used to draw this cloud are the mean tf-idf values. By definition, tf_idf = tf (term frequency) * idf (inverse document frequency), where \(idf=ln(n_{documents}/n_{documents-containing-term})\). The printed results above show that every document contains the word “watch”, so its idf is zero and therefore its tf-idf value is zero. Hence, the word “watch” is not emphasized in the TF-IDF diagram. In the previous plot, by contrast, the raw term frequency was used to draw the word cloud, and with such a high word frequency the word “watch” was emphasized.
#use Five dataset to plot the aggregated sentiment values versus chunk index
fivesentiment_tidy_frame <- fivetextFrame %>%
  unnest_tokens(word, text) %>%            # tokenize the text
  left_join(get_sentiments("afinn")) %>%   # attach AFINN sentiment values per word
  mutate(line1 = floor(line / 5)) %>%      # chunk index: 5 lines per chunk
  # BUG FIX: the original `group_by(line1, sort=TRUE)` created a spurious
  # constant grouping column named `sort`; group only by the chunk index.
  group_by(line1) %>%
  summarize(Sentiment = sum(value, na.rm = TRUE))  # total sentiment per chunk
# Bar chart of the aggregated sentiment values versus chunk index.
plot_ly(fivesentiment_tidy_frame, x = ~line1, y = ~Sentiment) %>%
  add_bars() %>%
  layout(title = "the aggregated sentiment values using Five ",
         xaxis = list(title = 'chunk'))
#use OneTwo dataset to plot the aggregated sentiment values versus chunk index using the same way
onetwosentiment_tidy_frame <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  left_join(get_sentiments("afinn")) %>%   # AFINN sentiment value per word
  mutate(line1 = floor(line / 5)) %>%      # chunk index: 5 lines per chunk
  # BUG FIX: dropped the stray `sort=TRUE` that the original passed to
  # group_by(), which silently became a constant grouping column `sort`.
  group_by(line1) %>%
  summarize(Sentiment = sum(value, na.rm = TRUE))
plot_ly(onetwosentiment_tidy_frame, x = ~line1, y = ~Sentiment) %>%
  add_bars() %>%
  layout(title = "the aggregated sentiment values using OneTwo ",
         xaxis = list(title = 'chunk'))
Make a comparative analysis between these plots. Does sentiment analysis show a connection of the corresponding documents to the kinds of reviews we expect to see in them?
According to the description about Five file, this file contains feedback of customers who were pleased with their purchase. By observing the sentiment plot, all the sentiment values are positive(with the highest value = 60) which means they are satisfied with their watches. It shows a connection of corresponding documents to the kinds of reviews we expect to see.
Based on the description of the OneTwo file, this file contains feedback from customers who were not pleased with their purchase. By observing the sentiment plot, it contains both negative and positive values (max = 29, min = -22), but with considerably more positive sentiment values (12 positive, 7 negative). In this case, it is hard to see a connection between the corresponding documents and the kinds of reviews we expect to see in them.
Create the phrase nets for Five.Txt and One.Txt with connector words
am, is, are, was, were
at
#use the codes from the template
# phraseNet(): build a phrase net from raw text lines.
# It extracts every trigram "word1 connector word3", keeps the trigrams whose
# middle word is in `connectors` and whose outer words are not stop words,
# and renders the (word1 -> word3) pairs as a directed visNetwork graph.
#   text       - character vector of text lines
#   connectors - character vector of allowed middle words (e.g. c("is","are"))
# Returns a visNetwork htmlwidget.
# FIX: removed two dead bare expressions (`tidy_frame3`, `tidy_frame_filtered`)
# whose values were silently discarded inside the function body.
phraseNet = function(text, connectors){
  # Collapse the whole article into a single one-cell tibble.
  textFrame = tibble(text = paste(text, collapse = " "))
  # Tokenize into trigrams, one trigram per row.
  tidy_frame3 = textFrame %>% unnest_tokens(word, text, token = "ngrams", n = 3)
  # Split each trigram into its three component words.
  tidy_frame_sep = tidy_frame3 %>%
    separate(word, c("word1", "word2", "word3"), sep = " ")
  # Keep trigrams whose middle word is a connector and whose outer words
  # are not stop words.
  tidy_frame_filtered = tidy_frame_sep %>%
    filter(word2 %in% connectors) %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word3 %in% stop_words$word)
  # Edge list: one edge per distinct (word1, word3) pair, weighted by count.
  edges = tidy_frame_filtered %>%
    count(word1, word3, sort = T) %>%
    rename(from = word1, to = word3, width = n) %>%
    mutate(arrows = "to")
  # Weighted in-degree (to) and out-degree (from) of every word.
  right_words = edges %>% count(word = to, wt = width)
  left_words = edges %>% count(word = from, wt = width)
  # Node table: size = total degree, color = binned (out - in) degree.
  nodes = left_words %>%
    full_join(right_words, by = "word") %>%
    replace_na(list(n.x = 0, n.y = 0)) %>%
    mutate(n.total = n.x + n.y) %>%
    mutate(n.out = n.x - n.y) %>%
    mutate(id = word, color = brewer.pal(9, "Blues")[cut_interval(n.out, 9)], font.size = 40) %>%
    rename(label = word, value = n.total)
  # Drop edges where neither endpoint has further connections, then drop
  # the nodes that no remaining edge touches.
  edges = edges %>% left_join(nodes, c("from" = "id")) %>%
    left_join(nodes, c("to" = "id")) %>%
    filter(value.x > 1 | value.y > 1) %>%
    select(from, to, width, arrows)
  nodes = nodes %>% filter(id %in% edges$from | id %in% edges$to)
  visNetwork(nodes, edges)
}
# Read both review files.
text1 <- read_lines("Five.txt")
text2 <- read_lines("OneTwo.txt")
# Phrase nets for the two datasets with "to be" connector words.
connector_words1 <- c("am", "is", "are", "was", "were")
phraseNet_Five1 <- phraseNet(text1, connector_words1)
phraseNet_Five1
phraseNet_OneTwo1 <- phraseNet(text2, connector_words1)
phraseNet_OneTwo1
# Same plots with "at" as the only connector word.
connector_words2 <- c("at")
phraseNet_Five2 <- phraseNet(text1, connector_words2)
phraseNet_Five2
phraseNet_OneTwo2 <- phraseNet(text2, connector_words2)
phraseNet_OneTwo2
# Word tree for the Five file (only the "watch" part kept).
knitr::include_graphics("Five_wordtree.png")
# Word tree for the OneTwo file.
knitr::include_graphics("OneTwo_wordtree.png")
According to the word trees and phrase nets, the most often mentioned properties are: extremely accurate, cheap, huge, durable, and water resistant.
What are satisfied customers talking about?
They are talking about the cheap price, the accurate time, the huge size, and the water resistance.
What are unsatisfied customers talking about?
Although it has a lot of good reviews, the watch is not so accurate and always stops.
What are properties of the watch mentioned by both groups?
The two groups both mention the cheap price and water resistant.
Can you understand watch characteristics (like size of display, features of the watches) by observing these graphs?
By observing the plots, we can understand some watch characteristics. We can summarize it like this: The size of this watch is huge. This watch has a rubber strap. The materials are durable. The watch glows at night and the time is readable.
In this assignment we continue analyzing data olive.csv that you started working with in lab 2. These data contain information about contents of olive oils coming from different regions in Italy.
#Q21
# Load olive.csv (first column holds the row names) and attach a readable
# region name derived from the numeric Region code.
olive <- read.csv("olive.csv", row.names = 1)
olive <- olive %>%
  mutate(Region_Name = case_when(
    Region == 1 ~ "North",
    Region == 2 ~ "South",
    Region == 3 ~ "Sardinia island"
  ))
# Interactive scatter plot of eicosenoic versus linoleic; hovering a point
# shows its eicosenoic value.
q21_plot <- olive %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Scatter Plot of Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
q21_plot
Link the scatterplot of (eicosenoic, linoleic) to a bar chart showing Region and a slider that allows to filter the data by the values of stearic.
Use persistent brushing to identify the regions that correspond unusually low values of eicosenoic.
Use the slider and describe what additional relationships in the data can be found by using it.
Report which interaction operators were used in this step.
#Q22
#Template from course website
# Shared data object so crosstalk can link the plots and the slider below.
d <- SharedData$new(olive)
# Scatter plot of eicosenoic vs linoleic; hover shows the eicosenoic value.
scatterCrab_q22 <- d %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
# Bar chart of observation counts per region, on the same shared data.
barCrab_q22 <- plot_ly(d, x = ~Region_Name) %>%
  add_histogram() %>%
  layout(barmode = "overlay")
# Link a stearic filter slider with the scatter plot and the bar chart;
# persistent multi-color brushing is enabled on the subplot.
bscols(
  filter_slider("stearic", "Stearic", d, ~stearic),
  subplot(scatterCrab_q22, barCrab_q22,
          widths = c(0.7, 0.3),          # give the scatter plot more space
          titleY = TRUE, titleX = TRUE) %>%  # show axis titles
    highlight(on = "plotly_selected",    # select many points at once
              off = "plotly_deselect",   # explicit off action avoids warnings
              dynamic = T, persistent = T, opacityDim = I(1)) %>%
    hide_legend(),
  widths = c(4, 8)  # widths of the slider column and the subplot
)
#Q23
# Two scatter plots on the same shared data `d`, similar to part 2.
scatterCrab_23_eicosenoic <- d %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
scatterCrab_23_arachidic <- d %>%
  plot_ly(
    x = ~linolenic, y = ~arachidic,
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Arachidic vs. Linolenic",
    xaxis = list(title = "Linolenic"),
    yaxis = list(title = "Arachidic")
  )
# Side-by-side subplot with persistent multi-color brushing.
bscols(
  subplot(scatterCrab_23_eicosenoic, scatterCrab_23_arachidic,
          titleY = TRUE, titleX = TRUE) %>%
    layout(title = "") %>%
    highlight(on = "plotly_selected",   # select many points at once
              off = "plotly_deselect",
              dynamic = T, persistent = T, opacityDim = I(1)) %>%
    hide_legend()
)
Figure 2-3
In above “Figure 2-3”, we checked three groups that we consider as
outliers.
First, the red data points, these are observations that have low
Linolenic value(<10) on “Linolenic vs Arachidic plot”, they
correspond to low Eicosenoic value(<5) on “Linoleic vs Eicosenoic
plot”.
Second, green data points, these are observations that have mid
Linolenic value(~50) but low Arachidic value(<20), they are also
observations with low Eicosenoic value(<5) on “Linoleic vs Eicosenoic
plot”.
Third, the purple data points are two observations that have the
highest Linolenic value (>65). However, they are scattered in the
“Linoleic vs Eicosenoic plot”: one lies in the main cluster and the
other in the lower-Eicosenoic-value cluster.
In conclusion, the red and green outliers on the “Linolenic vs Arachidic plot” are also outliers in the “Linoleic vs Eicosenoic plot”. However, the purple outliers from the “Linoleic vs Arachidic plot” do not group in the same way on the “Linoleic vs Eicosenoic plot”.
#Q24
#Code modified from course website
# Three linked views: a parallel-coordinate plot, a 3D scatter plot with
# variable-selection dropdowns, and a region histogram.  All three share
# crosstalk group "q24" keyed on the per-row id .ID, so brushing one view
# highlights the same observations in the others.
#Parallel plot
p<-ggparcoord(olive, columns = c(3:10)) #Choose the column of acids
q24_plotly_data<-plotly_data(ggplotly(p))%>%group_by(.ID)
d1<-SharedData$new(q24_plotly_data, key =~.ID, group="q24") #Key and Group are crucial for linking the plots
parallel_plot<-plot_ly(d1, x=~variable, y=~value)%>%
add_lines(line=list(width=0.3))%>%
add_markers(marker=list(size=0.3),
text=~.ID, hoverinfo="text")
olive2=olive #Create a new data frame so we don't modify the original one
olive2$.ID=1:nrow(olive) #add the .ID column (row identifier for linking)
d2<-SharedData$new(olive2, ~.ID, group="q24") #use .ID as grouping identifier
## Variable selection, from course website
# One dropdown button list per 3D axis; each button restyles that axis
# with one of the eight acid columns.
ButtonsX=list()
for (i in 3:10){ #modified the indices so we choose the acid columns
ButtonsX[[i-2]]= list(method = "restyle",
args = list( "x", list(olive2[[i]])),
label = colnames(olive2)[i])
}
ButtonsY=list()
for (i in 3:10){
ButtonsY[[i-2]]= list(method = "restyle",
args = list( "y", list(olive2[[i]])),
label = colnames(olive2)[i])
}
ButtonsZ=list()
for (i in 3:10){
ButtonsZ[[i-2]]= list(method = "restyle",
args = list( "z", list(olive2[[i]])),
label = colnames(olive2)[i])
}
## 3D plot, from course website
three_d_plot <- plot_ly(d2,x=~palmitic,y=~palmitic,z=~palmitic) %>%#set the default box as palmitic on all axes
add_markers() %>%
layout(scene=list(xaxis=list(title="x"), yaxis=list(title="y"), zaxis=list(title="z")),title = "",
updatemenus = list(
list(y=0.9, buttons = ButtonsX), #y is the vertical position of the drop boxes
list(y=0.6, buttons = ButtonsY),
list(y=0.3, buttons = ButtonsZ)
) )
# Histogram of observations per region, linked to the other two plots.
histogram_plot <- plot_ly(d2, x=~as.factor(Region_Name)) %>% add_histogram() %>%
layout(barmode="overlay", xaxis = list(title="Region"))
# Lay the three linked plots out side by side; the parallel plot brushes on
# lasso/box selection, the 3D plot and histogram brush on click.
bscols(
parallel_plot %>% highlight(
on = "plotly_selected",
off = "plotly_deselect",
dynamic = TRUE,
persistent = TRUE,
opacityDim = I(1)
) %>%
hide_legend(),
three_d_plot %>% highlight(
on = "plotly_click",
off = "plotly_doubleclick",
dynamic = TRUE,
persistent = TRUE
) %>%
hide_legend(),
histogram_plot %>% highlight(
on = "plotly_click",
off = "plotly_doubleclick",
dynamic = TRUE,
persistent = TRUE
),
widths=c(4,4,4)) #width for each subplot
Figure 2-4-1- Parallel coordinate plot: colored by Region
Figure 2-4-2- 3D plot:: colored by Region
“Figure 2-4-1” and “Figure 2-4-2” are both colored by Region,
where red is South, green is Sardinia island and blue is North.
By observing the parallel coordinate plot(Figure 2-4-1) it seems like ‘oleic’, ‘palmitic’ and ‘eicosenic’ are reasonable to use as influential variables. The reason is ‘eicosenic’ and ‘palmitic’ create two clusters: [North vs South+Sardinia island], and ‘oleic’ differentiate South and Sardinia island well.
As shown in “Figure 2-4-2”, by selecting the influential variables we have chosen, the 3D scatter plot does show three clusters representing the different regions.
In step 4, we have used below operators
Selection operator: Persistent Brushing (Screen space)
Connection operator: Persistent Brushing (Attribute space)
Reconfiguring operator: Drop boxes,Change aesthetics mapping,(Data value
space)
Navigation operator: Camera location in 3D scatter plot (Data value
space)
Filtering operator might be useful for this step as well. Just like part 2, one can filter the range for specific variable and make further analysis on how the region might affect the particular variable.
One strategy we suggest is to first color the oils from the different regions, then inspect the parallel coordinate plot to see whether any acids form clusters. Next, use the 3D scatter plot to confirm whether those acids indeed separate the oils by region. Finally, if they do, go back to the parallel coordinate plot and note the ranges of those acid levels, so that when a new observation arrives we can check its values against those ranges and determine which region the oil comes from.
# Knitr chunk setup: echo all code in the rendered report.
knitr::opts_chunk$set(echo = TRUE)
# NOTE(review): rm(list=ls()) only clears the user workspace, not loaded
# packages or options; restarting a fresh R session is the recommended
# alternative — consider removing this line.
rm(list=ls())
# Packages used throughout the report.
library(plotly)        # interactive plots
library(tidytext)      # tokenization, stop_words, bind_tf_idf, sentiments
library(dplyr)         # data-manipulation verbs
library(tidyr)         # separate(), replace_na()
library(readr)         # read_lines()
library(wordcloud)     # wordcloud()
library(RColorBrewer)  # brewer.pal() palettes
library(visNetwork)    # phrase-net graphs
library(crosstalk)     # SharedData, bscols, filter_slider
library(GGally)        # ggparcoord()
library(ggplot2)       # ggplotly() input for the parallel plot
# Word cloud for the Five.txt reviews -----------------------------------
# Read the file, one line per vector element.
fivetext <- read_lines("Five.txt")
# Store the lines in a tibble and record each line's number.
fivetextFrame <- tibble(text = fivetext) %>%
  mutate(line = row_number())
# Six-color palette reused by every word cloud below.
pal <- brewer.pal(6, "Dark2")
# Tokenize to one word per row, drop stop words, count the remaining
# words, then draw a word cloud of the 100 most frequent ones.
five_counts <- fivetextFrame %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word)
fivetidy_frame <- five_counts %>%
  with(wordcloud(word, n, max.words = 100, colors = pal, random.order = FALSE))
# Word cloud for the OneTwo.txt reviews, built the same way --------------
onetwotext <- read_lines("OneTwo.txt")
onetwotextFrame <- tibble(text = onetwotext) %>%
  mutate(line = row_number())
# Same six-color palette as before.
pal <- brewer.pal(6, "Dark2")
onetwo_counts <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  count(word)
onetwotidy_frame <- onetwo_counts %>%
  with(wordcloud(word, n, max.words = 100, colors = pal, random.order = FALSE))
#aggregating each 10 lines into a separate "document" using OneTwo data.
onetwotidy_frame2 <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  mutate(line1 = floor(line / 10)) %>%  # document index = line number div 10
  count(line1, word, sort = TRUE)       # word counts per document
#compute TF-IDF values per (document, word) pair
TFIDF <- onetwotidy_frame2 %>% bind_tf_idf(word, line1, n)
#get the unique words (in order of first appearance)
words <- unique(TFIDF$word)
# Mean TF-IDF of each word over all documents.  vapply() replaces the
# original index-growing for-loop: identical order and values, but the
# result vector is preallocated and the loop no longer reuses `n`
# (also the name of the count column) as an index variable.
tf_idfmean <- vapply(
  words,
  function(w) mean(TFIDF$tf_idf[TFIDF$word == w]),
  numeric(1),
  USE.NAMES = FALSE
)
TF_IDFmean <- data.frame(word = words, tf_idfmean = tf_idfmean)
#plot a word cloud weighted by the mean TF-IDF values
TF_IDFmean %>% with(wordcloud(words = word,
                              freq = tf_idfmean,
                              scale = c(1, .5),
                              max.words = 100,
                              colors = pal,
                              random.order = FALSE))
#print the rows containing the word "watch"
print(TF_IDFmean[TF_IDFmean$word == "watch", ])
print(TFIDF[TFIDF$word == "watch", ])
#use Five dataset to plot the aggregated sentiment values versus chunk index
fivesentiment_tidy_frame <- fivetextFrame %>%
  unnest_tokens(word, text) %>%            # tokenize the text
  left_join(get_sentiments("afinn")) %>%   # attach AFINN sentiment values per word
  mutate(line1 = floor(line / 5)) %>%      # chunk index: 5 lines per chunk
  # BUG FIX: the original `group_by(line1, sort=TRUE)` created a spurious
  # constant grouping column named `sort`; group only by the chunk index.
  group_by(line1) %>%
  summarize(Sentiment = sum(value, na.rm = TRUE))  # total sentiment per chunk
# Bar chart of the aggregated sentiment values versus chunk index.
plot_ly(fivesentiment_tidy_frame, x = ~line1, y = ~Sentiment) %>%
  add_bars() %>%
  layout(title = "the aggregated sentiment values using Five ",
         xaxis = list(title = 'chunk'))
#use OneTwo dataset to plot the aggregated sentiment values versus chunk index using the same way
onetwosentiment_tidy_frame <- onetwotextFrame %>%
  unnest_tokens(word, text) %>%
  left_join(get_sentiments("afinn")) %>%   # AFINN sentiment value per word
  mutate(line1 = floor(line / 5)) %>%      # chunk index: 5 lines per chunk
  # BUG FIX: dropped the stray `sort=TRUE` that the original passed to
  # group_by(), which silently became a constant grouping column `sort`.
  group_by(line1) %>%
  summarize(Sentiment = sum(value, na.rm = TRUE))
plot_ly(onetwosentiment_tidy_frame, x = ~line1, y = ~Sentiment) %>%
  add_bars() %>%
  layout(title = "the aggregated sentiment values using OneTwo ",
         xaxis = list(title = 'chunk'))
#use the codes from the template
# phraseNet(): build a phrase net from raw text lines.
# It extracts every trigram "word1 connector word3", keeps the trigrams whose
# middle word is in `connectors` and whose outer words are not stop words,
# and renders the (word1 -> word3) pairs as a directed visNetwork graph.
#   text       - character vector of text lines
#   connectors - character vector of allowed middle words (e.g. c("is","are"))
# Returns a visNetwork htmlwidget.
# FIX: removed two dead bare expressions (`tidy_frame3`, `tidy_frame_filtered`)
# whose values were silently discarded inside the function body.
phraseNet = function(text, connectors){
  # Collapse the whole article into a single one-cell tibble.
  textFrame = tibble(text = paste(text, collapse = " "))
  # Tokenize into trigrams, one trigram per row.
  tidy_frame3 = textFrame %>% unnest_tokens(word, text, token = "ngrams", n = 3)
  # Split each trigram into its three component words.
  tidy_frame_sep = tidy_frame3 %>%
    separate(word, c("word1", "word2", "word3"), sep = " ")
  # Keep trigrams whose middle word is a connector and whose outer words
  # are not stop words.
  tidy_frame_filtered = tidy_frame_sep %>%
    filter(word2 %in% connectors) %>%
    filter(!word1 %in% stop_words$word) %>%
    filter(!word3 %in% stop_words$word)
  # Edge list: one edge per distinct (word1, word3) pair, weighted by count.
  edges = tidy_frame_filtered %>%
    count(word1, word3, sort = T) %>%
    rename(from = word1, to = word3, width = n) %>%
    mutate(arrows = "to")
  # Weighted in-degree (to) and out-degree (from) of every word.
  right_words = edges %>% count(word = to, wt = width)
  left_words = edges %>% count(word = from, wt = width)
  # Node table: size = total degree, color = binned (out - in) degree.
  nodes = left_words %>%
    full_join(right_words, by = "word") %>%
    replace_na(list(n.x = 0, n.y = 0)) %>%
    mutate(n.total = n.x + n.y) %>%
    mutate(n.out = n.x - n.y) %>%
    mutate(id = word, color = brewer.pal(9, "Blues")[cut_interval(n.out, 9)], font.size = 40) %>%
    rename(label = word, value = n.total)
  # Drop edges where neither endpoint has further connections, then drop
  # the nodes that no remaining edge touches.
  edges = edges %>% left_join(nodes, c("from" = "id")) %>%
    left_join(nodes, c("to" = "id")) %>%
    filter(value.x > 1 | value.y > 1) %>%
    select(from, to, width, arrows)
  nodes = nodes %>% filter(id %in% edges$from | id %in% edges$to)
  visNetwork(nodes, edges)
}
# Read both review files.
text1 <- read_lines("Five.txt")
text2 <- read_lines("OneTwo.txt")
# Phrase nets for the two datasets with "to be" connector words.
connector_words1 <- c("am", "is", "are", "was", "were")
phraseNet_Five1 <- phraseNet(text1, connector_words1)
phraseNet_Five1
phraseNet_OneTwo1 <- phraseNet(text2, connector_words1)
phraseNet_OneTwo1
# Same plots with "at" as the only connector word.
connector_words2 <- c("at")
phraseNet_Five2 <- phraseNet(text1, connector_words2)
phraseNet_Five2
phraseNet_OneTwo2 <- phraseNet(text2, connector_words2)
phraseNet_OneTwo2
# Word tree for the Five file (only the "watch" part kept).
knitr::include_graphics("Five_wordtree.png")
# Word tree for the OneTwo file.
knitr::include_graphics("OneTwo_wordtree.png")
#Q21
# Load olive.csv (first column holds the row names) and attach a readable
# region name derived from the numeric Region code.
olive <- read.csv("olive.csv", row.names = 1)
olive <- olive %>%
  mutate(Region_Name = case_when(
    Region == 1 ~ "North",
    Region == 2 ~ "South",
    Region == 3 ~ "Sardinia island"
  ))
# Interactive scatter plot of eicosenoic versus linoleic; hovering a point
# shows its eicosenoic value.
q21_plot <- olive %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Scatter Plot of Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
q21_plot
#Q22
#Template from course website
# Shared data object so crosstalk can link the plots and the slider below.
d <- SharedData$new(olive)
# Scatter plot of eicosenoic vs linoleic; hover shows the eicosenoic value.
scatterCrab_q22 <- d %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
# Bar chart of observation counts per region, on the same shared data.
barCrab_q22 <- plot_ly(d, x = ~Region_Name) %>%
  add_histogram() %>%
  layout(barmode = "overlay")
# Link a stearic filter slider with the scatter plot and the bar chart;
# persistent multi-color brushing is enabled on the subplot.
bscols(
  filter_slider("stearic", "Stearic", d, ~stearic),
  subplot(scatterCrab_q22, barCrab_q22,
          widths = c(0.7, 0.3),          # give the scatter plot more space
          titleY = TRUE, titleX = TRUE) %>%  # show axis titles
    highlight(on = "plotly_selected",    # select many points at once
              off = "plotly_deselect",   # explicit off action avoids warnings
              dynamic = T, persistent = T, opacityDim = I(1)) %>%
    hide_legend(),
  widths = c(4, 8)  # widths of the slider column and the subplot
)
#Q23
# Two scatter plots on the same shared data `d`, similar to part 2.
scatterCrab_23_eicosenoic <- d %>%
  plot_ly(
    x = ~linoleic, y = ~eicosenoic,
    text = ~paste("Eicosenoic value:", eicosenoic),
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Eicosenoic vs. Linoleic",
    xaxis = list(title = "Linoleic"),
    yaxis = list(title = "Eicosenoic")
  )
scatterCrab_23_arachidic <- d %>%
  plot_ly(
    x = ~linolenic, y = ~arachidic,
    type = "scatter", mode = "markers"
  ) %>%
  layout(
    title = "Arachidic vs. Linolenic",
    xaxis = list(title = "Linolenic"),
    yaxis = list(title = "Arachidic")
  )
# Side-by-side subplot with persistent multi-color brushing.
bscols(
  subplot(scatterCrab_23_eicosenoic, scatterCrab_23_arachidic,
          titleY = TRUE, titleX = TRUE) %>%
    layout(title = "") %>%
    highlight(on = "plotly_selected",   # select many points at once
              off = "plotly_deselect",
              dynamic = T, persistent = T, opacityDim = I(1)) %>%
    hide_legend()
)
# Static snapshot of the brushed plots (Figure 2-3).
knitr::include_graphics("Q23.png")
#Q24
#Code modified from course website
# Three linked views: a parallel-coordinate plot, a 3D scatter plot with
# variable-selection dropdowns, and a region histogram.  All three share
# crosstalk group "q24" keyed on the per-row id .ID, so brushing one view
# highlights the same observations in the others.
#Parallel plot
p<-ggparcoord(olive, columns = c(3:10)) #Choose the column of acids
q24_plotly_data<-plotly_data(ggplotly(p))%>%group_by(.ID)
d1<-SharedData$new(q24_plotly_data, key =~.ID, group="q24") #Key and Group are crucial for linking the plots
parallel_plot<-plot_ly(d1, x=~variable, y=~value)%>%
add_lines(line=list(width=0.3))%>%
add_markers(marker=list(size=0.3),
text=~.ID, hoverinfo="text")
olive2=olive #Create a new data frame so we don't modify the original one
olive2$.ID=1:nrow(olive) #add the .ID column (row identifier for linking)
d2<-SharedData$new(olive2, ~.ID, group="q24") #use .ID as grouping identifier
## Variable selection, from course website
# One dropdown button list per 3D axis; each button restyles that axis
# with one of the eight acid columns.
ButtonsX=list()
for (i in 3:10){ #modified the indices so we choose the acid columns
ButtonsX[[i-2]]= list(method = "restyle",
args = list( "x", list(olive2[[i]])),
label = colnames(olive2)[i])
}
ButtonsY=list()
for (i in 3:10){
ButtonsY[[i-2]]= list(method = "restyle",
args = list( "y", list(olive2[[i]])),
label = colnames(olive2)[i])
}
ButtonsZ=list()
for (i in 3:10){
ButtonsZ[[i-2]]= list(method = "restyle",
args = list( "z", list(olive2[[i]])),
label = colnames(olive2)[i])
}
## 3D plot, from course website
three_d_plot <- plot_ly(d2,x=~palmitic,y=~palmitic,z=~palmitic) %>%#set the default box as palmitic on all axes
add_markers() %>%
layout(scene=list(xaxis=list(title="x"), yaxis=list(title="y"), zaxis=list(title="z")),title = "",
updatemenus = list(
list(y=0.9, buttons = ButtonsX), #y is the vertical position of the drop boxes
list(y=0.6, buttons = ButtonsY),
list(y=0.3, buttons = ButtonsZ)
) )
# Histogram of observations per region, linked to the other two plots.
histogram_plot <- plot_ly(d2, x=~as.factor(Region_Name)) %>% add_histogram() %>%
layout(barmode="overlay", xaxis = list(title="Region"))
# Lay the three linked plots out side by side; the parallel plot brushes on
# lasso/box selection, the 3D plot and histogram brush on click.
bscols(
parallel_plot %>% highlight(
on = "plotly_selected",
off = "plotly_deselect",
dynamic = TRUE,
persistent = TRUE,
opacityDim = I(1)
) %>%
hide_legend(),
three_d_plot %>% highlight(
on = "plotly_click",
off = "plotly_doubleclick",
dynamic = TRUE,
persistent = TRUE
) %>%
hide_legend(),
histogram_plot %>% highlight(
on = "plotly_click",
off = "plotly_doubleclick",
dynamic = TRUE,
persistent = TRUE
),
widths=c(4,4,4)) #width for each subplot
# Static snapshots of the linked plots (Figures 2-4-1 and 2-4-2).
knitr::include_graphics("Q24_par.png")
knitr::include_graphics("Q24_3d.png")